from docx import Document

def extract_text_from_docx(file_path):
    """Return the text of all body paragraphs of a .docx file as one string.

    Paragraph texts are concatenated with no separator between them. Table
    content is NOT part of the returned string: each table row is only
    printed to stdout (cleaned and de-duplicated) so its layout can be
    inspected.

    Args:
        file_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        The concatenation of ``paragraph.text`` for every paragraph in
        ``doc.paragraphs``.
    """
    doc = Document(file_path)
    text = [paragraph.text for paragraph in doc.paragraphs]

    for table in doc.tables:
        for row_idx in range(len(table.rows)):
            # Fetch the row's cells with a single call instead of one
            # table.cell(row, col) lookup per column; strip newlines and
            # spaces from each cell's text.
            cleaned = [
                cell.text.replace('\n', '').replace(' ', '')
                for cell in table.row_cells(row_idx)
            ]
            # Drop repeated values while keeping first-occurrence order
            # (presumably to collapse repeats coming from merged cells --
            # note that distinct cells with equal text collapse as well).
            unique = list(dict.fromkeys(cleaned))

            # Print each row so its format characteristics can be observed.
            print(row_idx, ":len(", len(unique), '):', unique)

    return ''.join(text)

# Sample input document; the path is machine-specific.
file_path = r'C:\Users\houzh\Downloads\2023-12-30 北京市教委报告-0431.docx'

# Run the extraction only when this file is executed as a script, so that
# importing the module does not open the hard-coded path above.
if __name__ == "__main__":
    text = extract_text_from_docx(file_path)
    print(text)
